﻿/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_correlate_f64.c
 * Description:  Correlation of floating-point sequences
 *
 * $Date:        10 August 2022
 * $Revision:    V1.10.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */

/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "arm_compiler_specific.h"


#include "dsp/filtering_functions.h"

/**
  @ingroup groupFilters
 */

/**
  @addtogroup Corr
  @{
 */

/**
  @brief         Correlation of floating-point sequences.
  @param[in]     pSrcA      points to the first input sequence
  @param[in]     srcALen    length of the first input sequence
  @param[in]     pSrcB      points to the second input sequence
  @param[in]     srcBLen    length of the second input sequence
  @param[out]    pDst       points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
 */

ARM_DSP_ATTRIBUTE void arm_correlate_f64(
    const float64_t * pSrcA,
    uint32_t srcALen,
    const float64_t * pSrcB,
    uint32_t srcBLen,
    float64_t * pDst)
{
    const float64_t *pIn1;                               /* InputA pointer */
    const float64_t *pIn2;                               /* InputB pointer */
    float64_t *pOut = pDst;                        /* Output pointer */
    const float64_t *px;                                 /* Intermediate inputA pointer */
    const float64_t *py;                                 /* Intermediate inputB pointer */
    const float64_t *pSrc1;
    float64_t sum;
    uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
    uint32_t j, k, count, blkCnt;                  /* Loop counters */
    uint32_t outBlockSize;                         /* Loop counter */
    int32_t inc = 1;                               /* Destination address modifier */
#if defined(ARM_MATH_NEON) && defined(__aarch64__) && !defined(ARM_MATH_AUTOVECTORIZE)
    float64x2_t sumV,pxV,pyV ;
#endif
    
    /* The algorithm implementation is based on the lengths of the inputs. */
    /* srcB is always made to slide across srcA. */
    /* So srcBLen is always considered as shorter or equal to srcALen */
    /* But CORR(x, y) is reverse of CORR(y, x) */
    /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
    /* and the destination pointer modifier, inc is set to -1 */
    /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
    /* But to improve the performance,
     * we assume zeroes in the output instead of zero padding either of the the inputs*/
    /* If srcALen > srcBLen,
     * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
    /* If srcALen < srcBLen,
     * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
    if (srcALen >= srcBLen)
    {
        /* Initialization of inputA pointer */
        pIn1 = pSrcA;
        
        /* Initialization of inputB pointer */
        pIn2 = pSrcB;
        
        /* Number of output samples is calculated */
        outBlockSize = (2U * srcALen) - 1U;
        
        /* When srcALen > srcBLen, zero padding has to be done to srcB
         * to make their lengths equal.
         * Instead, (outBlockSize - (srcALen + srcBLen - 1))
         * number of output samples are made zero */
        j = outBlockSize - (srcALen + (srcBLen - 1U));
        
        /* Updating the pointer position to non zero value */
        pOut += j;
    }
    else
    {
        /* Initialization of inputA pointer */
        pIn1 = pSrcB;
        
        /* Initialization of inputB pointer */
        pIn2 = pSrcA;
        
        /* srcBLen is always considered as shorter or equal to srcALen */
        j = srcBLen;
        srcBLen = srcALen;
        srcALen = j;
        
        /* CORR(x, y) = Reverse order(CORR(y, x)) */
        /* Hence set the destination pointer to point to the last output sample */
        pOut = pDst + ((srcALen + srcBLen) - 2U);
        
        /* Destination address modifier is set to -1 */
        inc = -1;
    }
    
    /* The function is internally
     * divided into three stages according to the number of multiplications that has to be
     * taken place between inputA samples and inputB samples. In the first stage of the
     * algorithm, the multiplications increase by one for every iteration.
     * In the second stage of the algorithm, srcBLen number of multiplications are done.
     * In the third stage of the algorithm, the multiplications decrease by one
     * for every iteration. */
    
    /* The algorithm is implemented in three stages.
     The loop counters of each stage is initiated here. */
    blockSize1 = srcBLen - 1U;
    blockSize2 = srcALen - (srcBLen - 1U);
    blockSize3 = blockSize1;
    
    /* --------------------------
     * Initializations of stage1
     * -------------------------*/
    
    /* sum = x[0] * y[srcBlen - 1]
     * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
     * ....
     * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
     */
    
    /* In this stage the MAC operations are increased by 1 for every iteration.
     The count variable holds the number of MAC operations performed */
    count = 1U;
    
    /* Working pointer of inputA */
    px = pIn1;
    
    /* Working pointer of inputB */
    pSrc1 = pIn2 + (srcBLen - 1U);
    py = pSrc1;
    
    /* ------------------------
     * Stage1 process
     * ----------------------*/
    
    /* The first stage starts here */
    while (blockSize1 > 0U)
    {
        /* Accumulator is made zero for every iteration */
        sum = 0.;
#if defined(ARM_MATH_NEON) && defined(__aarch64__) && !defined(ARM_MATH_AUTOVECTORIZE)
        sumV = vdupq_n_f64(0.0);
        k = count >> 1U ;
        
        while(k > 0U)
        {
            pxV = vld1q_f64(px);
            pyV = vld1q_f64(py);
            sumV = vmlaq_f64(sumV, pxV, pyV);
            px+=2;
            py+=2;
            k--;
        }
        sum =vaddvq_f64(sumV);
        k = count & 1 ;
#else
        /* Initialize k with number of samples */
        k = count;
#endif
        while (k > 0U)
        {
            /* Perform the multiply-accumulate */
            /* x[0] * y[srcBLen - 1] */
            sum += *px++ * *py++;
            
            /* Decrement loop counter */
            k--;
        }
        
        /* Store the result in the accumulator in the destination buffer. */
        
        *pOut = sum;
        /* Destination pointer is updated according to the address modifier, inc */
        pOut += inc;
        
        /* Update the inputA and inputB pointers for next MAC calculation */
        py = pSrc1 - count;
        px = pIn1;
        
        /* Increment MAC count */
        count++;
        
        /* Decrement loop counter */
        blockSize1--;
        
    }
    
    /* --------------------------
     * Initializations of stage2
     * ------------------------*/
    
    /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
     * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen]   * y[srcBLen-1]
     * ....
     * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
     */
    
    /* Working pointer of inputA */
    px = pIn1;
    
    /* Working pointer of inputB */
    py = pIn2;
    
    /* count is index by which the pointer pIn1 to be incremented */
    count = 0U;
    
    /* -------------------
     * Stage2 process
     * ------------------*/
    
    /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
     * So, to loop unroll over blockSize2,
     * srcBLen should be greater than or equal to 4 */
    if (srcBLen >= 4U)
    {
        /* Initialize blkCnt with number of samples */
        blkCnt = blockSize2;
        
        while (blkCnt > 0U)
        {
            /* Accumulator is made zero for every iteration */
            sum = 0.;
#if defined(ARM_MATH_NEON) && defined(__aarch64__) && !defined(ARM_MATH_AUTOVECTORIZE)
            sumV = vdupq_n_f64(0.0);
            k = srcBLen >> 1U ;
            while(k > 0U)
            {
                pxV = vld1q_f64(px);
                pyV = vld1q_f64(py);
                sumV = vmlaq_f64(sumV, pxV, pyV);
                px+=2;
                py+=2;
                k--;
            }
            sum =vaddvq_f64(sumV);
            k = srcBLen & 1 ;
            
#else
            
            /* Initialize blkCnt with number of samples */
            k = srcBLen;
#endif
            while (k > 0U)
            {
                /* Perform the multiply-accumulate */
                sum += *px++ * *py++;
                
                /* Decrement the loop counter */
                k--;
            }
            
            /* Store the result in the accumulator in the destination buffer. */
            *pOut = sum;
            
            /* Destination pointer is updated according to the address modifier, inc */
            pOut += inc;
            
            /* Increment the pointer pIn1 index, count by 1 */
            count++;
            
            /* Update the inputA and inputB pointers for next MAC calculation */
            px = pIn1 + count;
            py = pIn2;
            
            /* Decrement the loop counter */
            blkCnt--;
        }
    }
    else
    {
        /* If the srcBLen is not a multiple of 4,
         * the blockSize2 loop cannot be unrolled by 4 */
        blkCnt = blockSize2;
        
        while (blkCnt > 0U)
        {
            /* Accumulator is made zero for every iteration */
            sum = 0.;
#if defined(ARM_MATH_NEON) && defined(__aarch64__) && !defined(ARM_MATH_AUTOVECTORIZE)
            sumV = vdupq_n_f64(0.0);
            k = srcBLen >> 1U ;
            while(k > 0U)
            {
                pxV = vld1q_f64(px);
                pyV = vld1q_f64(py);
                sumV = vmlaq_f64(sumV, pxV, pyV);
                px+=2;
                py+=2;
                k--;
            }
            sum =vaddvq_f64(sumV);
            k = srcBLen & 1 ;
            
#else
            
            /* Loop over srcBLen */
            k = srcBLen;
#endif
            while (k > 0U)
            {
                /* Perform the multiply-accumulate */
                sum += *px++ * *py++;
                
                /* Decrement the loop counter */
                k--;
            }
            
            /* Store the result in the accumulator in the destination buffer. */
            *pOut = sum;
            /* Destination pointer is updated according to the address modifier, inc */
            pOut += inc;
            
            /* Increment the pointer pIn1 index, count by 1 */
            count++;
            
            /* Update the inputA and inputB pointers for next MAC calculation */
            px = pIn1 + count;
            py = pIn2;
            
            /* Decrement the loop counter */
            blkCnt--;
        }
    }
    
    
    /* --------------------------
     * Initializations of stage3
     * -------------------------*/
    
    /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
     * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
     * ....
     * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
     * sum +=  x[srcALen-1] * y[0]
     */
    
    /* In this stage the MAC operations are decreased by 1 for every iteration.
     The count variable holds the number of MAC operations performed */
    count = srcBLen - 1U;
    
    /* Working pointer of inputA */
    pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
    px = pSrc1;
    
    /* Working pointer of inputB */
    py = pIn2;
    
    /* -------------------
     * Stage3 process
     * ------------------*/
    
    while (blockSize3 > 0U)
    {
        /* Accumulator is made zero for every iteration */
        sum = 0.;
#if defined(ARM_MATH_NEON) && defined(__aarch64__) && !defined(ARM_MATH_AUTOVECTORIZE)
        sumV = vdupq_n_f64(0.0);
        k = count >> 1U ;
        
        while(k > 0U)
        {
            pxV = vld1q_f64(px);
            pyV = vld1q_f64(py);
            sumV = vmlaq_f64(sumV, pxV, pyV);
            px+=2;
            py+=2;
            k--;
        }
        sum =vaddvq_f64(sumV);
        k = count & 1 ;
#else
        
        /* Initialize blkCnt with number of samples */
        k = count;
#endif
        while (k > 0U)
        {
            /* Perform the multiply-accumulate */
            sum += *px++ * *py++;
            
            /* Decrement loop counter */
            k--;
        }
        
        /* Store the result in the accumulator in the destination buffer. */
        *pOut = sum;
        /* Destination pointer is updated according to the address modifier, inc */
        pOut += inc;
        
        /* Update the inputA and inputB pointers for next MAC calculation */
        px = ++pSrc1;
        py = pIn2;
        
        /* Decrement MAC count */
        count--;
        
        /* Decrement the loop counter */
        blockSize3--;
    }
}

/**
  @} end of Corr group
 */
